Loads the airway package
## ----loadairway------------------------------------------------------------
library("airway")
## Loading required package: SummarizedExperiment
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
## Loading required package: BiocParallel
##
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
##
## colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
##
## aperm, apply, rowsum
Sets the variable “dir” to the pathway where the airway package is locally stored
## ----dir-------------------------------------------------------------------
dir <- system.file("extdata", package="airway", mustWork=TRUE)
Lists all of the files and directories in the dir and then specifies for just the quant directories
## ----list.files------------------------------------------------------------
list.files(dir)
## [1] "GSE52778_series_matrix.txt" "Homo_sapiens.GRCh37.75_subset.gtf"
## [3] "quants" "sample_table.csv"
## [5] "SraRunInfo_SRP033351.csv" "SRR1039508_subset.bam"
## [7] "SRR1039509_subset.bam" "SRR1039512_subset.bam"
## [9] "SRR1039513_subset.bam" "SRR1039516_subset.bam"
## [11] "SRR1039517_subset.bam" "SRR1039520_subset.bam"
## [13] "SRR1039521_subset.bam"
list.files(file.path(dir, "quants"))
## [1] "SRR1039508" "SRR1039509"
A comma separated values table is assigned to the variable “csvfile” and the first row of the file is extracted and shown.
## ----sampleinfo------------------------------------------------------------
csvfile <- file.path(dir, "sample_table.csv")
coldata <- read.csv(csvfile, row.names=1, stringsAsFactors=FALSE)
coldata
## SampleName cell dex albut Run avgLength Experiment
## SRR1039508 GSM1275862 N61311 untrt untrt SRR1039508 126 SRX384345
## SRR1039509 GSM1275863 N61311 trt untrt SRR1039509 126 SRX384346
## SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512 126 SRX384349
## SRR1039513 GSM1275867 N052611 trt untrt SRR1039513 87 SRX384350
## SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516 120 SRX384353
## SRR1039517 GSM1275871 N080611 trt untrt SRR1039517 126 SRX384354
## SRR1039520 GSM1275874 N061011 untrt untrt SRR1039520 101 SRX384357
## SRR1039521 GSM1275875 N061011 trt untrt SRR1039521 98 SRX384358
## Sample BioSample
## SRR1039508 SRS508568 SAMN02422669
## SRR1039509 SRS508567 SAMN02422675
## SRR1039512 SRS508571 SAMN02422678
## SRR1039513 SRS508572 SAMN02422670
## SRR1039516 SRS508575 SAMN02422682
## SRR1039517 SRS508576 SAMN02422673
## SRR1039520 SRS508579 SAMN02422683
## SRR1039521 SRS508580 SAMN02422677
A coldata table is created with two columns, names and files, and is populated with the paths to the quant files from the previous section.
## ----makecoldata-----------------------------------------------------------
coldata <- coldata[1:2,]
coldata$names <- coldata$Run
coldata$files <- file.path(dir, "quants", coldata$names, "quant.sf.gz")
file.exists(coldata$files)
## [1] TRUE TRUE
The “dim” call checks the dimensions and the “head” call displays the first lines of the gene object.
## ----lookgse---------------------------------------------------------------
dim(gse)
## [1] 58294 2
head(rownames(gse))
## [1] "ENSG00000000003.14" "ENSG00000000005.5" "ENSG00000000419.12"
## [4] "ENSG00000000457.13" "ENSG00000000460.16" "ENSG00000000938.12"
A graph is created with a plotting method to visualize the intersection of the colData object and the range of rows.
## ----sumexp, echo=FALSE----------------------------------------------------
par(mar=c(0,0,0,0))
plot(1,1,xlim=c(0,100),ylim=c(0,100),bty="n",
type="n",xlab="",ylab="",xaxt="n",yaxt="n")
polygon(c(45,90,90,45),c(5,5,70,70),col="pink",border=NA)
polygon(c(45,90,90,45),c(68,68,70,70),col="pink3",border=NA)
text(67.5,40,"assay(s)")
text(67.5,35,'e.g. "counts", ...')
polygon(c(10,40,40,10),c(5,5,70,70),col="skyblue",border=NA)
polygon(c(10,40,40,10),c(68,68,70,70),col="skyblue3",border=NA)
text(25,40,"rowRanges")
polygon(c(45,90,90,45),c(75,75,95,95),col="palegreen",border=NA)
polygon(c(45,47,47,45),c(75,75,95,95),col="palegreen3",border=NA)
text(67.5,85,"colData")

The data call loads the full gse object.
## ----loadfullgse-----------------------------------------------------------
data(gse)
gse
## class: RangedSummarizedExperiment
## dim: 58294 8
## metadata(6): tximetaInfo quantInfo ... txomeInfo txdbInfo
## assays(3): counts abundance length
## rownames(58294): ENSG00000000003.14 ENSG00000000005.5 ...
## ENSG00000285993.1 ENSG00000285994.1
## rowData names(1): gene_id
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(3): names donor condition
The “assayNames” call displays the titles of each column, while “head” shows the first few lines of the object and the “colSums” shows the sum of each column in the object.
## ----assaysgse-------------------------------------------------------------
assayNames(gse)
## [1] "counts" "abundance" "length"
head(assay(gse), 3)
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003.14 708.164 467.962 900.992 424.368 1188.295
## ENSG00000000005.5 0.000 0.000 0.000 0.000 0.000
## ENSG00000000419.12 455.000 510.000 604.000 352.000 583.000
## SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003.14 1090.668 805.929 599.337
## ENSG00000000005.5 0.000 0.000 0.000
## ENSG00000000419.12 773.999 409.999 499.000
colSums(assay(gse))
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520
## 21100805 19298584 26145537 15688246 25268618 31891456 19683767
## SRR1039521
## 21813903
Shows the ranges for the first and final five genes.
## ----rowrangesgse----------------------------------------------------------
rowRanges(gse)
## GRanges object with 58294 ranges and 1 metadata column:
## seqnames ranges strand | gene_id
## <Rle> <IRanges> <Rle> | <character>
## ENSG00000000003.14 chrX 100627109-100639991 - | ENSG00000000003.14
## ENSG00000000005.5 chrX 100584802-100599885 + | ENSG00000000005.5
## ENSG00000000419.12 chr20 50934867-50958555 - | ENSG00000000419.12
## ENSG00000000457.13 chr1 169849631-169894267 - | ENSG00000000457.13
## ENSG00000000460.16 chr1 169662007-169854080 + | ENSG00000000460.16
## ... ... ... ... . ...
## ENSG00000285990.1 chr14 19244904-19269380 - | ENSG00000285990.1
## ENSG00000285991.1 chr6 149817937-149896011 - | ENSG00000285991.1
## ENSG00000285992.1 chr8 47129262-47132628 + | ENSG00000285992.1
## ENSG00000285993.1 chr18 46409197-46410645 - | ENSG00000285993.1
## ENSG00000285994.1 chr10 12563151-12567351 + | ENSG00000285994.1
## -------
## seqinfo: 25 sequences (1 circular) from hg38 genome
The “gse\(donor" and "gse\)condition” displays each of the parameters in the object.
## ----gsevars---------------------------------------------------------------
gse$donor
## [1] N61311 N61311 N052611 N052611 N080611 N080611 N061011 N061011
## Levels: N052611 N061011 N080611 N61311
gse$condition
## [1] Untreated Dexamethasone Untreated Dexamethasone Untreated
## [6] Dexamethasone Untreated Dexamethasone
## Levels: Untreated Dexamethasone
Each of these renames these parameters.
## ----gsevarsrename---------------------------------------------------------
gse$cell <- gse$donor
gse$dex <- gse$condition
The “levels(gse$dex) <- c(”untrt“,”trt“)” call casts the names of the parameters to the new vector.
## ----renamelevels----------------------------------------------------------
levels(gse$dex)
## [1] "Untreated" "Dexamethasone"
# when renaming levels, the order must be preserved!
levels(gse$dex) <- c("untrt", "trt")
This loads the “magrittr” package and executes a pipe with the relevel function.
## ----gsedex----------------------------------------------------------------
library("magrittr")
gse$dex %<>% relevel("untrt")
gse$dex
## [1] untrt trt untrt trt untrt trt untrt trt
## Levels: untrt trt
This line is not executed, but is the equilvalent to the previous line of code.
## ----explaincmpass, eval = FALSE-------------------------------------------
# gse$dex <- relevel(gse$dex, "untrt")
The “round” call displays the fragments that can be mapped to the genes.
## ----countreads------------------------------------------------------------
round( colSums(assay(gse)) / 1e6, 1 )
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520
## 21.1 19.3 26.1 15.7 25.3 31.9 19.7
## SRR1039521
## 21.8
The "DESeq2 package is loaded.
## ----loaddeseq2------------------------------------------------------------
library("DESeq2")
This call creates a fully annotated dds object for the gene analysis.
## ----makedds---------------------------------------------------------------
dds <- DESeqDataSet(gse, design = ~ cell + dex)
## using counts and average transcript lengths from tximeta
This creates a countdata object that creates an assay with the counts from the gse object. The “head” call displays the first lines of the countdata object.
## --------------------------------------------------------------------------
countdata <- round(assays(gse)[["counts"]])
head(countdata, 3)
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003.14 708 468 901 424 1188
## ENSG00000000005.5 0 0 0 0 0
## ENSG00000000419.12 455 510 604 352 583
## SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003.14 1091 806 599
## ENSG00000000005.5 0 0 0
## ENSG00000000419.12 774 410 499
The coldata object is recast to hold the data from the gse object.
## --------------------------------------------------------------------------
coldata <- colData(gse)
An object, “ddsMat”, is created to hold the colData and countData objects.
## --------------------------------------------------------------------------
ddsMat <- DESeqDataSetFromMatrix(countData = countdata,
colData = coldata,
design = ~ cell + dex)
## converting counts to integer mode
This block is used to filter out the rows that contain one or fewer counts to isolate the data rows that have useful data.
## --------------------------------------------------------------------------
nrow(dds)
## [1] 58294
keep <- rowSums(counts(dds)) > 1
dds <- dds[keep,]
nrow(dds)
## [1] 31604
This goes to continue filtering the dataset to isolate rows with a higher data density.
## --------------------------------------------------------------------------
# at least 3 samples with a count of 10 or higher
keep <- rowSums(counts(dds) >= 10) >= 3
The output of this graph is used to show how the variance in the data grwos as the average does.
## ----meanSdCts-------------------------------------------------------------
lambda <- 10^seq(from = -1, to = 2, length = 1000)
cts <- matrix(rpois(1000*100, lambda), ncol = 100)
library("vsn")
meanSdPlot(cts, ranks = FALSE)

This shows the raw values used to create the first graph.
## ----vst-------------------------------------------------------------------
vsd <- vst(dds, blind = FALSE)
## using 'avgTxLength' from assays(dds), correcting for library size
head(assay(vsd), 3)
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003.14 10.105781 9.852029 10.169726 9.991545 10.424865
## ENSG00000000419.12 9.692244 9.923647 9.801921 9.798653 9.763455
## ENSG00000000457.13 9.449592 9.312186 9.362754 9.459168 9.281415
## SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003.14 10.194490 10.315814 10.002177
## ENSG00000000419.12 9.874703 9.683211 9.845507
## ENSG00000000457.13 9.395937 9.477971 9.477027
colData(vsd)
## DataFrame with 8 rows and 5 columns
## names donor condition cell dex
## <factor> <factor> <factor> <factor> <factor>
## SRR1039508 SRR1039508 N61311 Untreated N61311 untrt
## SRR1039509 SRR1039509 N61311 Dexamethasone N61311 trt
## SRR1039512 SRR1039512 N052611 Untreated N052611 untrt
## SRR1039513 SRR1039513 N052611 Dexamethasone N052611 trt
## SRR1039516 SRR1039516 N080611 Untreated N080611 untrt
## SRR1039517 SRR1039517 N080611 Dexamethasone N080611 trt
## SRR1039520 SRR1039520 N061011 Untreated N061011 untrt
## SRR1039521 SRR1039521 N061011 Dexamethasone N061011 trt
This shows the raw values used to create the second graph.
## ----rlog------------------------------------------------------------------
rld <- rlog(dds, blind = FALSE)
## using 'avgTxLength' from assays(dds), correcting for library size
head(assay(rld), 3)
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003.14 9.482613 9.172197 9.558383 9.346001 9.851349
## ENSG00000000419.12 8.860186 9.150196 9.000042 8.995902 8.951327
## ENSG00000000457.13 8.354790 8.166700 8.236582 8.366693 8.121781
## SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003.14 9.587602 9.727248 9.357876
## ENSG00000000419.12 9.091075 8.848782 9.054384
## ENSG00000000457.13 8.282307 8.392384 8.391023
The packages “dplyr” and “ggplot2” are loaded and the samples are plotted against eachother to show the differences.
## ----transformplot, fig.width = 6, fig.height = 2.5------------------------
library("dplyr")
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:AnnotationDbi':
##
## select
## The following object is masked from 'package:matrixStats':
##
## count
## The following object is masked from 'package:Biobase':
##
## combine
## The following objects are masked from 'package:GenomicRanges':
##
## intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
##
## intersect
## The following objects are masked from 'package:IRanges':
##
## collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
##
## first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("ggplot2")
dds <- estimateSizeFactors(dds)
## using 'avgTxLength' from assays(dds), correcting for library size
df <- bind_rows(
as_data_frame(log2(counts(dds, normalized=TRUE)[, 1:2]+1)) %>%
mutate(transformation = "log2(x + 1)"),
as_data_frame(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst"),
as_data_frame(assay(rld)[, 1:2]) %>% mutate(transformation = "rlog"))
## Warning: `as_data_frame()` is deprecated, use `as_tibble()` (but mind the new semantics).
## This warning is displayed once per session.
colnames(df)[1:2] <- c("x", "y")
ggplot(df, aes(x = x, y = y)) + geom_hex(bins = 80) +
coord_fixed() + facet_grid( . ~ transformation)

This is used to determine the overall similarity between the samples.
## --------------------------------------------------------------------------
sampleDists <- dist(t(assay(vsd)))
sampleDists
## SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517
## SRR1039509 39.42362
## SRR1039512 32.37620 44.93748
## SRR1039513 51.09677 37.18799 41.79886
## SRR1039516 35.59642 47.54671 34.83458 52.05265
## SRR1039517 51.26314 41.58572 46.89609 40.67315 39.74268
## SRR1039520 32.38578 46.96000 30.35980 48.08669 37.07106 50.38349
## SRR1039521 51.49108 37.57383 47.17283 31.45899 52.62276 41.35941
## SRR1039520
## SRR1039509
## SRR1039512
## SRR1039513
## SRR1039516
## SRR1039517
## SRR1039520
## SRR1039521 43.01502
The packages “pheatmap” and “RColorBrewer” are loaded.
## --------------------------------------------------------------------------
library("pheatmap")
library("RColorBrewer")
This creates a heatmap to show the sample distance matrix.
## ----distheatmap, fig.width = 6.1, fig.height = 4.5------------------------
sampleDistMatrix <- as.matrix( sampleDists )
rownames(sampleDistMatrix) <- paste( vsd$dex, vsd$cell, sep = " - " )
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows = sampleDists,
clustering_distance_cols = sampleDists,
col = colors)

The “PoiClaClu” package is loaded and used to do Poisson Distance analysis.
## --------------------------------------------------------------------------
library("PoiClaClu")
poisd <- PoissonDistance(t(counts(dds)))
The Poisson Distance analysis is used to create a heatmap for the sample distances. The Poisson distance matrix uses the raw data values instead of the normalized ones.
## ----poisdistheatmap, fig.width = 6.1, fig.height = 4.5--------------------
samplePoisDistMatrix <- as.matrix( poisd$dd )
rownames(samplePoisDistMatrix) <- paste( dds$dex, dds$cell, sep=" - " )
colnames(samplePoisDistMatrix) <- NULL
pheatmap(samplePoisDistMatrix,
clustering_distance_rows = poisd$dd,
clustering_distance_cols = poisd$dd,
col = colors)

A Principle Components Analysis plot is created with the to show variance.
## ----plotpca, fig.width=6, fig.height=4.5----------------------------------
plotPCA(vsd, intgroup = c("dex", "cell"))

This uses the ggplot2 package to begin working on the PCA plot from the vsd data.
## --------------------------------------------------------------------------
pcaData <- plotPCA(vsd, intgroup = c( "dex", "cell"), returnData = TRUE)
pcaData
## PC1 PC2 group dex cell name
## SRR1039508 -14.311369 -2.6000421 untrt:N61311 untrt N61311 SRR1039508
## SRR1039509 8.058574 -0.7500532 trt:N61311 trt N61311 SRR1039509
## SRR1039512 -9.404122 -4.3920761 untrt:N052611 untrt N052611 SRR1039512
## SRR1039513 14.497842 -4.1323833 trt:N052611 trt N052611 SRR1039513
## SRR1039516 -12.365055 11.2109581 untrt:N080611 untrt N080611 SRR1039516
## SRR1039517 9.343946 14.9115160 trt:N080611 trt N080611 SRR1039517
## SRR1039520 -10.852633 -7.7618618 untrt:N061011 untrt N061011 SRR1039520
## SRR1039521 15.032816 -6.4860576 trt:N061011 trt N061011 SRR1039521
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot2 is used to create the PCA plot with significantly more control voer the details.
## ----ggplotpca, fig.width=6, fig.height=4.5--------------------------------
ggplot(pcaData, aes(x = PC1, y = PC2, color = dex, shape = cell)) +
geom_point(size =3) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
coord_fixed() +
ggtitle("PCA with VST data")

The “glmpca” package is loaded and creates a generalized PCA plot.
## --------------------------------------------------------------------------
library("glmpca")
gpca <- glmpca(counts(dds), L=2)
gpca.dat <- gpca$factors
gpca.dat$dex <- dds$dex
gpca.dat$cell <- dds$cell
## ----glmpca, fig.width=6, fig.height=4.5-----------------------------------
ggplot(gpca.dat, aes(x = dim1, y = dim2, color = dex, shape = cell)) +
geom_point(size =3) + coord_fixed() + ggtitle("glmpca - Generalized PCA")

An MDS (Multidimensional Scaling) plot is best used with the matrixes of distances instead of the matrixes of data.
## ----mdsvst, fig.width=6, fig.height=4.5-----------------------------------
mds <- as.data.frame(colData(vsd)) %>%
cbind(cmdscale(sampleDistMatrix))
ggplot(mds, aes(x = `1`, y = `2`, color = dex, shape = cell)) +
geom_point(size = 3) + coord_fixed() + ggtitle("MDS with VST data")

A similar MDS plot can be created with the Poisson Distances.
## ----mdspois, fig.width=6, fig.height=4.5----------------------------------
mdsPois <- as.data.frame(colData(dds)) %>%
cbind(cmdscale(samplePoisDistMatrix))
ggplot(mdsPois, aes(x = `1`, y = `2`, color = dex, shape = cell)) +
geom_point(size = 3) + coord_fixed() + ggtitle("MDS with PoissonDistances")

This runs a differential expression pipeline on the raw counts.
## ----airwayDE--------------------------------------------------------------
dds <- DESeq(dds)
## using pre-existing normalization factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
The results of the differential expression pipeline are shown here.
## --------------------------------------------------------------------------
res <- results(dds)
res
## log2 fold change (MLE): dex trt vs untrt
## Wald test p-value: dex trt vs untrt
## DataFrame with 31604 rows and 6 columns
## baseMean log2FoldChange lfcSE
## <numeric> <numeric> <numeric>
## ENSG00000000003.14 739.940717372053 -0.361153676816704 0.106868570706213
## ENSG00000000419.12 511.73572199559 0.206314742225697 0.128664534265382
## ENSG00000000457.13 314.194855097564 0.037830765702789 0.158633410116225
## ENSG00000000460.16 79.7936215457165 -0.115258993162199 0.314990674190846
## ENSG00000000938.12 0.307266835513683 -1.36911852155525 3.50376358863452
## ... ... ... ...
## ENSG00000285979.1 38.3538864147071 0.342365713888024 0.359510635836304
## ENSG00000285987.1 1.56250793343253 0.706414476454889 1.5472945399791
## ENSG00000285990.1 0.642314540619725 0.364733264633251 3.43327565299413
## ENSG00000285991.1 11.2762841129775 -0.116551517129412 0.748601160880117
## ENSG00000285994.1 3.65104108073599 -0.0960094332693204 1.0686598396981
## stat pvalue padj
## <numeric> <numeric> <numeric>
## ENSG00000000003.14 -3.37941898567666 0.000726392125305196 0.00531136916201091
## ENSG00000000419.12 1.60350902759384 0.108822317644649 0.293188699547079
## ENSG00000000457.13 0.238479180867837 0.811509460868018 0.92255697455246
## ENSG00000000460.16 -0.365912398702847 0.714430444201867 0.872980377910416
## ENSG00000000938.12 -0.390756535628255 0.695977205170472 NA
## ... ... ... ...
## ENSG00000285979.1 0.952310390182487 0.340939589953335 0.600750489158651
## ENSG00000285987.1 0.456548160807465 0.647995846834697 NA
## ENSG00000285990.1 0.106234774453712 0.915396081024322 NA
## ENSG00000285991.1 -0.155692407679924 0.876275481983982 0.952921308734717
## ENSG00000285994.1 -0.089840966884695 0.928413592935384 NA
The more controlled results could also have been achieved by directly specifying the results parameters.
## --------------------------------------------------------------------------
res <- results(dds, contrast=c("dex","trt","untrt"))
A summary of the “res” object is shown here.
## --------------------------------------------------------------------------
summary(res)
##
## out of 31604 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 2373, 7.5%
## LFC < 0 (down) : 1949, 6.2%
## outliers [1] : 0, 0%
## low counts [2] : 14706, 47%
## (mean count < 9)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
The false discovery rate threshold is lowered and the results function is updated to optimize filtration.
## --------------------------------------------------------------------------
res.05 <- results(dds, alpha = 0.05)
table(res.05$padj < 0.05)
##
## FALSE TRUE
## 13357 3541
This selects for genes that have significant changes by more than a factor of 2 (either 0.5 or 2).
## --------------------------------------------------------------------------
resLFC1 <- results(dds, lfcThreshold=1)
table(resLFC1$padj < 0.1)
##
## FALSE TRUE
## 16687 211
Here the results of the comparison between two levels are shown.
## --------------------------------------------------------------------------
results(dds, contrast = c("cell", "N061011", "N61311"))
## log2 fold change (MLE): cell N061011 vs N61311
## Wald test p-value: cell N061011 vs N61311
## DataFrame with 31604 rows and 6 columns
## baseMean log2FoldChange lfcSE
## <numeric> <numeric> <numeric>
## ENSG00000000003.14 739.940717372053 0.270945073288714 0.152170652747696
## ENSG00000000419.12 511.73572199559 -0.0718309515716049 0.182816709211393
## ENSG00000000457.13 314.194855097564 0.17988057135378 0.225122028264875
## ENSG00000000460.16 79.7936215457165 -0.119482058987533 0.441593831219506
## ENSG00000000938.12 0.307266835513683 0 4.9975798104491
## ... ... ... ...
## ENSG00000285979.1 38.3538864147071 0.0589757126275811 0.512391427449622
## ENSG00000285987.1 1.56250793343253 1.02168042060967 2.20186060481305
## ENSG00000285990.1 0.642314540619725 -3.09564035666317 4.85271500898552
## ENSG00000285991.1 11.2762841129775 -0.877962796217153 1.04696318967239
## ENSG00000285994.1 3.65104108073599 -0.0192350857348909 1.5132364065081
## stat pvalue padj
## <numeric> <numeric> <numeric>
## ENSG00000000003.14 1.78053434349099 0.0749885539964457 0.37882768043907
## ENSG00000000419.12 -0.392912397786058 0.694384184390243 0.93670340928549
## ENSG00000000457.13 0.799035850646014 0.424269624416598 0.820732883668396
## ENSG00000000460.16 -0.270570036401033 0.786721743935893 0.960661528779669
## ENSG00000000938.12 0 1 NA
## ... ... ... ...
## ENSG00000285979.1 0.115098944806955 0.908366696268635 0.983709901493367
## ENSG00000285987.1 0.46400776614849 0.642642181542788 NA
## ENSG00000285990.1 -0.637919257762125 0.523526241080306 NA
## ENSG00000285991.1 -0.838580386471737 0.40170482075606 NA
## ENSG00000285994.1 -0.0127112232114989 0.989858184362336 NA
Multiple Testing methods are required to provide a P value that disproves the null.
## ----sumres----------------------------------------------------------------
sum(res$pvalue < 0.05, na.rm=TRUE)
## [1] 5170
sum(!is.na(res$pvalue))
## [1] 31604
Here we are looking for the number of genes with a less than 10% false positive rate.
## --------------------------------------------------------------------------
sum(res$padj < 0.1, na.rm=TRUE)
## [1] 4322
The genes are then arranged by strongest downregulation.
## --------------------------------------------------------------------------
resSig <- subset(res, padj < 0.1)
head(resSig[ order(resSig$log2FoldChange), ])
## log2 fold change (MLE): dex trt vs untrt
## Wald test p-value: dex trt vs untrt
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE
## <numeric> <numeric> <numeric>
## ENSG00000216490.3 42.3007106248874 -5.72482604043858 1.47565155349927
## ENSG00000267339.5 30.5206063861615 -5.39781111590036 0.773017198758288
## ENSG00000257542.5 10.0398869404764 -5.25991351125893 1.28200100388257
## ENSG00000146006.7 61.6448430808005 -4.49504194549369 0.663821044679586
## ENSG00000108700.4 14.6323552674909 -4.09068745839218 0.941842256424086
## ENSG00000213240.8 12.096158074824 -3.87312575409988 1.27413298748988
## stat pvalue padj
## <numeric> <numeric> <numeric>
## ENSG00000216490.3 -3.87952428665362 0.000104660951388532 0.000987852735620848
## ENSG00000267339.5 -6.98278269173179 2.89389866967638e-12 9.4586266383349e-11
## ENSG00000257542.5 -4.10289344183752 4.08015195589968e-05 0.000430645894758231
## ENSG00000146006.7 -6.77146646904417 1.27483509915645e-11 3.82631678606496e-10
## ENSG00000108700.4 -4.34328299722226 1.40369125072207e-05 0.00016668710298455
## ENSG00000213240.8 -3.03981279201488 0.00236725244353312 0.0144986704569854
They can also be arranged by the strongest upregulation.
## --------------------------------------------------------------------------
head(resSig[ order(resSig$log2FoldChange, decreasing = TRUE), ])
## log2 fold change (MLE): dex trt vs untrt
## Wald test p-value: dex trt vs untrt
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE
## <numeric> <numeric> <numeric>
## ENSG00000254692.1 62.2302142486739 10.2071429096331 3.37705826771585
## ENSG00000179593.15 67.0895286534166 9.50514812098004 1.07705493413338
## ENSG00000268173.3 46.4370434386407 8.40438119076897 3.38505617991746
## ENSG00000224712.12 35.5607475035812 7.16685536619654 2.16475512121352
## ENSG00000109906.13 438.193981971508 6.37749569519001 0.313810468763182
## ENSG00000257663.1 24.3946124843467 6.34758399422488 2.09531382411201
## stat pvalue padj
## <numeric> <numeric> <numeric>
## ENSG00000254692.1 3.0224953496396 0.00250699934752643 0.0152166935971629
## ENSG00000179593.15 8.82512843100996 1.09334265101889e-18 6.81745539369636e-17
## ENSG00000268173.3 2.48278927854424 0.0130358175653821 0.0594385443118797
## ENSG00000224712.12 3.3107002708828 0.000930628316571692 0.00655239887226186
## ENSG00000109906.13 20.3227627182916 8.08933590301556e-92 1.24266907353779e-88
## ENSG00000257663.1 3.02941923122899 0.00245024410719956 0.0149150666150786
The plotCounts method shows counts for the groupings of the genes.
## ----plotcounts------------------------------------------------------------
topGene <- rownames(res)[which.min(res$padj)]
plotCounts(dds, gene = topGene, intgroup=c("dex"))

The “ggbeeswarm” package is loaded and used along with ggplot2 to create a custom plotCount plot.
## ----ggplotcountsjitter, fig.width = 4, fig.height = 3---------------------
library("ggbeeswarm")
geneCounts <- plotCounts(dds, gene = topGene, intgroup = c("dex","cell"),
returnData = TRUE)
ggplot(geneCounts, aes(x = dex, y = count, color = cell)) +
scale_y_log10() + geom_beeswarm(cex = 3)

The normalized counts have connected lines to indicate cell lines impact.
## ----ggplotcountsgroup, fig.width = 4, fig.height = 3----------------------
ggplot(geneCounts, aes(x = dex, y = count, color = cell, group = cell)) +
scale_y_log10() + geom_point(size = 3) + geom_line()

The “apeglm” packge is loaded and a Minus Average plot is created.
## ----plotma----------------------------------------------------------------
library("apeglm")
resultsNames(dds)
## [1] "Intercept" "cell_N061011_vs_N052611"
## [3] "cell_N080611_vs_N052611" "cell_N61311_vs_N052611"
## [5] "dex_trt_vs_untrt"
res <- lfcShrink(dds, coef="dex_trt_vs_untrt", type="apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
plotMA(res, ylim = c(-5, 5))

This is the plot that lacks the log fold change reduction.
## ----plotmaNoShr-----------------------------------------------------------
res.noshr <- results(dds, name="dex_trt_vs_untrt")
plotMA(res.noshr, ylim = c(-5, 5))

This shows how to select specific rows or regions of the graph.
## ----plotmalabel-----------------------------------------------------------
plotMA(res, ylim = c(-5,5))
topGene <- rownames(res)[which.min(res$padj)]
with(res[topGene, ], {
points(baseMean, log2FoldChange, col="dodgerblue", cex=2, lwd=2)
text(baseMean, log2FoldChange, topGene, pos=2, col="dodgerblue")
})

The histogram shows the p value frequencies, and is most effective when smallest values are excluded.
## ----histpvalue2-----------------------------------------------------------
hist(res$pvalue[res$baseMean > 1], breaks = 0:20/20,
col = "grey50", border = "white")

The “genefilter” package is loaded and used to show gene clustering.
## --------------------------------------------------------------------------
library("genefilter")
##
## Attaching package: 'genefilter'
## The following objects are masked from 'package:matrixStats':
##
## rowSds, rowVars
topVarGenes <- head(order(rowVars(assay(vsd)), decreasing = TRUE), 20)
This heatmap shows deviation from the ave as opposed to expression strenght.
## ----genescluster----------------------------------------------------------
mat <- assay(vsd)[ topVarGenes, ]
mat <- mat - rowMeans(mat)
anno <- as.data.frame(colData(vsd)[, c("cell","dex")])
pheatmap(mat, annotation_col = anno)

New bins are created with the “quantile” function and the p values for each are calculated.
## ----sensitivityovermean, fig.width=6--------------------------------------
qs <- c(0, quantile(resLFC1$baseMean[resLFC1$baseMean > 0], 0:6/6))
bins <- cut(resLFC1$baseMean, qs)
levels(bins) <- paste0("~", round(signif((qs[-1] + qs[-length(qs)])/2, 2)))
fractionSig <- tapply(resLFC1$pvalue, bins, function(p)
mean(p < .05, na.rm = TRUE))
barplot(fractionSig, xlab = "mean normalized count",
ylab = "fraction of small p values")

## This line is not executed, but is similar to the previous block of code.
## ---- eval=FALSE-----------------------------------------------------------
# library("IHW")
# res.ihw <- results(dds, filterFun=ihw)
The “AnnotationDbi” and “org.Hs.eg.db” packages are loaded.
## --------------------------------------------------------------------------
library("AnnotationDbi")
library("org.Hs.eg.db")
##
This is a list of the available key types from the AnnotationDbi database.
## --------------------------------------------------------------------------
columns(org.Hs.eg.db)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS"
## [6] "ENTREZID" "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME"
## [11] "GO" "GOALL" "IPI" "MAP" "OMIM"
## [16] "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM" "PMID"
## [21] "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG" "UNIGENE"
## [26] "UNIPROT"
These are the external gene IDs.
## --------------------------------------------------------------------------
resOrdered <- res[order(res$pvalue),]
head(resOrdered)
## log2 fold change (MAP): dex trt vs untrt
## Wald test p-value: dex trt vs untrt
## DataFrame with 6 rows and 7 columns
## baseMean log2FoldChange lfcSE
## <numeric> <numeric> <numeric>
## ENSG00000189221.9 2373.80530666857 3.38765439100121 0.136984725340872
## ENSG00000120129.5 3420.72722668767 2.96334836319586 0.120849818517125
## ENSG00000101347.9 14125.5841210142 3.74129175571393 0.157926779740445
## ENSG00000196136.17 2710.21742603852 3.23518345105418 0.143950996111791
## ENSG00000152583.12 974.737366684742 4.48640613542985 0.201275651989011
## ENSG00000211445.11 12512.7915263227 3.75874563219025 0.169535830279915
## pvalue padj symbol
## <numeric> <numeric> <character>
## ENSG00000189221.9 1.84493931066867e-137 3.11757844716792e-133 MAOA
## ENSG00000120129.5 6.3504237592554e-135 5.36547303419488e-131 DUSP1
## ENSG00000101347.9 2.88983300388474e-127 1.62774660332148e-123 SAMHD1
## ENSG00000196136.17 3.68488166768093e-114 1.55667826051181e-110 SERPINA3
## ENSG00000152583.12 2.94551443785581e-113 9.9546605941775e-110 SPARCL1
## ENSG00000211445.11 2.36246306534271e-112 6.65348347969353e-109 GPX3
## entrez
## <character>
## ENSG00000189221.9 4128
## ENSG00000120129.5 1843
## ENSG00000101347.9 25939
## ENSG00000196136.17 12
## ENSG00000152583.12 8404
## ENSG00000211445.11 2878
This code is not evaluated.
## ----eval=FALSE------------------------------------------------------------
# resOrderedDF <- as.data.frame(resOrdered)[1:100, ]
# write.csv(resOrderedDF, file = "results.csv")
## ----eval=FALSE------------------------------------------------------------
# library("ReportingTools")
# htmlRep <- HTMLReport(shortName="report", title="My report",
# reportDirectory="./report")
# publish(resOrderedDF, htmlRep)
# url <- finish(htmlRep)
# browseURL(url)
This plots the differential expression results in the genomic space.
## --------------------------------------------------------------------------
resGR <- lfcShrink(dds, coef="dex_trt_vs_untrt", type="apeglm", format="GRanges")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
resGR
## GRanges object with 31604 ranges and 5 metadata columns:
## seqnames ranges strand | baseMean
## <Rle> <IRanges> <Rle> | <numeric>
## ENSG00000000003.14 chrX 100627109-100639991 - | 739.940717372053
## ENSG00000000419.12 chr20 50934867-50958555 - | 511.73572199559
## ENSG00000000457.13 chr1 169849631-169894267 - | 314.194855097564
## ENSG00000000460.16 chr1 169662007-169854080 + | 79.7936215457165
## ENSG00000000938.12 chr1 27612064-27635277 - | 0.307266835513683
## ... ... ... ... . ...
## ENSG00000285979.1 chr16 57177349-57181390 + | 38.3538864147071
## ENSG00000285987.1 chr9 84316514-84657077 + | 1.56250793343253
## ENSG00000285990.1 chr14 19244904-19269380 - | 0.642314540619725
## ENSG00000285991.1 chr6 149817937-149896011 - | 11.2762841129775
## ENSG00000285994.1 chr10 12563151-12567351 + | 3.65104108073599
## log2FoldChange lfcSE
## <numeric> <numeric>
## ENSG00000000003.14 -0.336004620197314 0.105908878209215
## ENSG00000000419.12 0.178378944093913 0.122440452558955
## ENSG00000000457.13 0.0299360694683492 0.141095483475456
## ENSG00000000460.16 -0.0555061349569227 0.222786670799123
## ENSG00000000938.12 -0.0115799289067039 0.30473963728621
## ... ... ...
## ENSG00000285979.1 0.15284386124239 0.257069545767735
## ENSG00000285987.1 0.0255152677225242 0.300687312203615
## ENSG00000285990.1 -0.000185630381439551 0.304464769734699
## ENSG00000285991.1 -0.0150788219861866 0.283931479729372
## ENSG00000285994.1 -0.00684681225244801 0.293399449125489
## pvalue padj
## <numeric> <numeric>
## ENSG00000000003.14 0.000726392125305196 0.00531136916201091
## ENSG00000000419.12 0.108822317644649 0.293188699547079
## ENSG00000000457.13 0.811509460868018 0.92255697455246
## ENSG00000000460.16 0.714430444201867 0.872980377910416
## ENSG00000000938.12 0.695977205170472 <NA>
## ... ... ...
## ENSG00000285979.1 0.340939589953335 0.600750489158651
## ENSG00000285987.1 0.647995846834697 <NA>
## ENSG00000285990.1 0.915396081024322 <NA>
## ENSG00000285991.1 0.876275481983982 0.952921308734717
## ENSG00000285994.1 0.928413592935384 <NA>
## -------
## seqinfo: 25 sequences (1 circular) from hg38 genome
Symbols are added to include gene labels on the plot
## --------------------------------------------------------------------------
ens.str <- substr(names(resGR), 1, 15)
resGR$symbol <- mapIds(org.Hs.eg.db, ens.str, "SYMBOL", "ENSEMBL")
## 'select()' returned 1:many mapping between keys and columns
The “Gviz” package is loaded.
## --------------------------------------------------------------------------
library("Gviz")
## Loading required package: grid
A million base pairs window on either side of the gene with the smallest p value is set to collect a subset of genes within that window.
## --------------------------------------------------------------------------
window <- resGR[topGene] + 1e6
strand(window) <- "*"
resGRsub <- resGR[resGR %over% window]
naOrDup <- is.na(resGRsub$symbol) | duplicated(resGRsub$symbol)
resGRsub$group <- ifelse(naOrDup, names(resGRsub), resGRsub$symbol)
A vector identifying if the “padj” value of the gene was low.
## --------------------------------------------------------------------------
status <- factor(ifelse(resGRsub$padj < 0.05 & !is.na(resGRsub$padj),
"sig", "notsig"))
A plot showing the locations of each gene on is created with functions from the “Gviz” package.
## ----gvizplot--------------------------------------------------------------
options(ucscChromosomeNames = FALSE)
g <- GenomeAxisTrack()
a <- AnnotationTrack(resGRsub, name = "gene ranges", feature = status)
d <- DataTrack(resGRsub, data = "log2FoldChange", baseline = 0,
type = "h", name = "log2 fold change", strand = "+")
plotTracks(list(g, d, a), groupAnnotation = "group",
notsig = "grey", sig = "hotpink")

The “sva” package is loaded.
## --------------------------------------------------------------------------
library("sva")
## Loading required package: mgcv
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## The following object is masked from 'package:IRanges':
##
## collapse
## This is mgcv 1.8-31. For overview type 'help("mgcv-package")'.
A matrix with the normalized counts is created for samples where the average count was greater than 1.
## --------------------------------------------------------------------------
dat <- counts(dds, normalized = TRUE)
idx <- rowMeans(dat) > 1
dat <- dat[idx, ]
mod <- model.matrix(~ dex, colData(dds))
mod0 <- model.matrix(~ 1, colData(dds))
svseq <- svaseq(dat, mod, mod0, n.sv = 2)
## Number of significant surrogate variables is: 2
## Iteration (out of 5 ):1 2 3 4 5
svseq$sv
## [,1] [,2]
## [1,] 0.2465669 -0.51599084
## [2,] 0.2588137 -0.59462876
## [3,] 0.1384516 0.24920662
## [4,] 0.2179075 0.37716083
## [5,] -0.6042910 -0.06305844
## [6,] -0.6138795 -0.03623320
## [7,] 0.1821306 0.30328185
## [8,] 0.1743002 0.28026195
This is used to check how well the “SVA” method recovered the sources of variation.
## ----svaplot---------------------------------------------------------------
par(mfrow = c(2, 1), mar = c(3,5,3,1))
for (i in 1:2) {
stripchart(svseq$sv[, i] ~ dds$cell, vertical = TRUE, main = paste0("SV", i))
abline(h = 0)
}

This is used to prevent the “SVA” method from impacting the counts.
## --------------------------------------------------------------------------
ddssva <- dds
ddssva$SV1 <- svseq$sv[,1]
ddssva$SV2 <- svseq$sv[,2]
design(ddssva) <- ~ SV1 + SV2 + dex
The “RUVSeq” package is loaded.
## --------------------------------------------------------------------------
library("RUVSeq")
## Loading required package: EDASeq
## Loading required package: ShortRead
## Loading required package: Biostrings
## Loading required package: XVector
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
## Loading required package: Rsamtools
## Loading required package: GenomicAlignments
##
## Attaching package: 'GenomicAlignments'
## The following object is masked from 'package:dplyr':
##
## last
##
## Attaching package: 'ShortRead'
## The following objects are masked from 'package:Gviz':
##
## chromosome, position
## The following object is masked from 'package:dplyr':
##
## id
## The following object is masked from 'package:magrittr':
##
## functions
## Loading required package: edgeR
## Loading required package: limma
##
## Attaching package: 'limma'
## The following object is masked from 'package:DESeq2':
##
## plotMA
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
Usng the “RUVg” method, it’s possible to estimate the factors of unwanted variation, which is similar to the “SVA” surrogate variables.
## --------------------------------------------------------------------------
set <- newSeqExpressionSet(counts(dds))
idx <- rowSums(counts(set) > 5) >= 2
set <- set[idx, ]
set <- betweenLaneNormalization(set, which="upper")
not.sig <- rownames(res)[which(res$pvalue > .1)]
empirical <- rownames(set)[ rownames(set) %in% not.sig ]
set <- RUVg(set, empirical, k=2)
pData(set)
## W_1 W_2
## SRR1039508 -0.224881168 0.42992983
## SRR1039509 -0.249022928 0.53858506
## SRR1039512 0.001460949 0.01437385
## SRR1039513 -0.175547525 0.08408354
## SRR1039516 0.599387535 -0.02512358
## SRR1039517 0.590516825 -0.02549392
## SRR1039520 -0.241071948 -0.50369551
## SRR1039521 -0.300841739 -0.51265927
This shows how the factors can be estimated by “RUV”.
## ----ruvplot---------------------------------------------------------------
par(mfrow = c(2, 1), mar = c(3,5,3,1))
for (i in 1:2) {
stripchart(pData(set)[, i] ~ dds$cell, vertical = TRUE, main = paste0("W", i))
abline(h = 0)
}

It is possible to control these factors by defining them.
## --------------------------------------------------------------------------
ddsruv <- dds
ddsruv$W1 <- set$W_1
ddsruv$W2 <- set$W_2
design(ddsruv) <- ~ W1 + W2 + dex
The “fission” package is loaded, and a basic time course analysis is run.
## --------------------------------------------------------------------------
library("fission")
data("fission")
ddsTC <- DESeqDataSet(fission, ~ strain + minute + strain:minute)
This is a likelihood ratio test where the strain specific differences are removed over time.
## ----fissionDE-------------------------------------------------------------
ddsTC <- DESeq(ddsTC, test="LRT", reduced = ~ strain + minute)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
resTC <- results(ddsTC)
resTC$symbol <- mcols(ddsTC)$symbol
head(resTC[order(resTC$padj),], 4)
## log2 fold change (MLE): strainmut.minute180
## LRT p-value: '~ strain + minute + strain:minute' vs '~ strain + minute'
## DataFrame with 4 rows and 7 columns
## baseMean log2FoldChange lfcSE
## <numeric> <numeric> <numeric>
## SPBC2F12.09c 174.671161802578 -2.65671953167536 0.752261272510079
## SPAC1002.18 444.504950375698 -0.0509321395113243 0.20429948485572
## SPAC1002.19 336.373206596558 -0.392748982380861 0.573494009173123
## SPAC1002.17c 261.773132733438 -1.13876476912802 0.60612875682789
## stat pvalue padj
## <numeric> <numeric> <numeric>
## SPBC2F12.09c 97.283386406583 1.97415107461335e-19 1.33452612643862e-15
## SPAC1002.18 56.953598622801 5.16955159226891e-11 1.74730843818689e-07
## SPAC1002.19 43.5339085770392 2.87980357353034e-08 6.48915738568838e-05
## SPAC1002.17c 39.3158355351511 2.05137078380187e-07 0.000346681662462517
## symbol
## <character>
## SPBC2F12.09c atf21
## SPAC1002.18 urg3
## SPAC1002.19 urg1
## SPAC1002.17c urg2
ggplot2 is used to show the counts for each gorup over a time frame.
## ----fissioncounts, fig.width=6, fig.height=4.5----------------------------
fiss <- plotCounts(ddsTC, which.min(resTC$padj),
intgroup = c("minute","strain"), returnData = TRUE)
fiss$minute <- as.numeric(as.character(fiss$minute))
ggplot(fiss,
aes(x = minute, y = count, color = strain, group = strain)) +
geom_point() + stat_summary(fun.y=mean, geom="line") +
scale_y_log10()

A “Wald” test for the log2 fold changes at individual time points.
## --------------------------------------------------------------------------
resultsNames(ddsTC)
## [1] "Intercept" "strain_mut_vs_wt" "minute_15_vs_0"
## [4] "minute_30_vs_0" "minute_60_vs_0" "minute_120_vs_0"
## [7] "minute_180_vs_0" "strainmut.minute15" "strainmut.minute30"
## [10] "strainmut.minute60" "strainmut.minute120" "strainmut.minute180"
res30 <- results(ddsTC, name="strainmut.minute30", test="Wald")
res30[which.min(resTC$padj),]
## log2 fold change (MLE): strainmut.minute30
## Wald test p-value: strainmut.minute30
## DataFrame with 1 row and 6 columns
## baseMean log2FoldChange lfcSE
## <numeric> <numeric> <numeric>
## SPBC2F12.09c 174.671161802578 -2.60046902875453 0.634342916343005
## stat pvalue padj
## <numeric> <numeric> <numeric>
## SPBC2F12.09c -4.09946885471073 4.14099389595899e-05 0.279931187366828
Cluster significant genes can be shown by their profiles.
## --------------------------------------------------------------------------
betas <- coef(ddsTC)
colnames(betas)
## [1] "Intercept" "strain_mut_vs_wt" "minute_15_vs_0"
## [4] "minute_30_vs_0" "minute_60_vs_0" "minute_120_vs_0"
## [7] "minute_180_vs_0" "strainmut.minute15" "strainmut.minute30"
## [10] "strainmut.minute60" "strainmut.minute120" "strainmut.minute180"
The log2 fold changes are plotted to a heatmap.
## ----fissionheatmap--------------------------------------------------------
topGenes <- head(order(resTC$padj),20)
mat <- betas[topGenes, -c(1,2)]
thr <- 3
mat[mat < -thr] <- -thr
mat[mat > thr] <- thr
pheatmap(mat, breaks=seq(from=-thr, to=thr, length=101),
cluster_col=FALSE)
